Hankowsky Homework Solutions

Basic Problems

8.28

# Load the NLSY79 income data for Sleuth3 exercise 8.28
income <- Sleuth3::ex0828

# Peek at the first rows to confirm the variables available
head(income)
##   Subject   AFQT Educ Income2005
## 1       2  6.841   12       5500
## 2       6 99.393   16      65000
## 3       7 47.412   12      19000
## 4       8 44.022   14      36000
## 5       9 59.683   14      65000
## 6      13 72.313   16       8000

# Jittered scatterplot of 2005 income against AFQT (intelligence test) score
ggplot(income, aes(x = AFQT, y = Income2005)) +
  geom_jitter() +
  theme_classic()

#plot the 2005 income as a function of AFQT score with group means
# (left commented out: AFQT is essentially continuous, so per-value group
#  means/error bars are too sparse to be informative here)
# income %>%
#   ggplot(aes(x = AFQT, y =  Income2005)) + 
#   geom_jitter(alpha = 0.4) + 
#   stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") + 
#   stat_summary(fun = "mean", colour = "red", geom = "line") + 
#   stat_summary(fun = mean,
#                geom = "errorbar",
#                fun.max = function(x) mean(x) + sd(x),
#                fun.min = function(x) mean(x) - sd(x), col = "red") + 
#   theme_classic()

#plot it again with the regression line and smooth 
# least-squares line (blue, default) vs. loess smooth (red, dashed):
# agreement between the two supports a straight-line model
income %>%
  ggplot(aes(x = AFQT, y =  Income2005)) +
  geom_jitter(alpha = 0.6) + 
  geom_smooth(method = "lm", se = F) + 
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#run the regression 
# simple linear regression of raw 2005 income on AFQT score
income_lm <- lm(Income2005 ~ AFQT, data = income)
summary(income_lm)
## 
## Call:
## lm(formula = Income2005 ~ AFQT, data = income)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72004 -24075  -7815  12447 643506 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21181.66    1925.59   11.00   <2e-16 ***
## AFQT          518.68      31.51   16.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44460 on 2582 degrees of freedom
## Multiple R-squared:  0.09496,    Adjusted R-squared:  0.09461 
## F-statistic: 270.9 on 1 and 2582 DF,  p-value: < 2.2e-16
# ANOVA table for the same fit (F-test equivalent to the slope t-test)
anova(income_lm)
## Analysis of Variance Table
## 
## Response: Income2005
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## AFQT         1 5.3556e+11 5.3556e+11  270.91 < 2.2e-16 ***
## Residuals 2582 5.1044e+12 1.9769e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 95% confidence intervals for the intercept and slope
confint(income_lm)
##                  2.5 %     97.5 %
## (Intercept) 17405.7989 24957.5148
## AFQT          456.8886   580.4756
#plot the diagnostics 
# 2x2 panel of base-R diagnostic plots; the residuals-vs-fitted panel
# shows increasing spread ("trumpeting"), motivating the log transform below
par(mfrow=c(2,2))
plot(income_lm)

###############################################################################

# The residual plot for the raw-income fit fans out ("trumpeting"),
# so switch to the natural log of 2005 income.
income <- mutate(income, log_income = log(Income2005))

# Jittered scatterplot of log(2005 income) against AFQT score
ggplot(income, aes(x = AFQT, y = log_income)) +
  geom_jitter() +
  theme_classic()

# Same scatterplot overlaid with the least-squares line (blue) and a
# loess smooth (red, dashed) as a linearity check
ggplot(income, aes(x = AFQT, y = log_income)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#run the regression 
# simple linear regression of log(2005 income) on AFQT score
income_lmb <- lm(log_income ~ AFQT, data = income)
summary(income_lmb)
## 
## Call:
## lm(formula = log_income ~ AFQT, data = income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4608 -0.3784  0.1224  0.5661  2.8794 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.868419   0.040267  245.08   <2e-16 ***
## AFQT        0.010458   0.000659   15.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9298 on 2582 degrees of freedom
## Multiple R-squared:  0.08887,    Adjusted R-squared:  0.08851 
## F-statistic: 251.8 on 1 and 2582 DF,  p-value: < 2.2e-16
anova(income_lmb)
## Analysis of Variance Table
## 
## Response: log_income
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## AFQT         1  217.71 217.706  251.84 < 2.2e-16 ***
## Residuals 2582 2232.07   0.864                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lmb)
##                   2.5 %     97.5 %
## (Intercept) 9.789461406 9.94737749
## AFQT        0.009165405 0.01174977
#back-transform the estimates 
# the response was natural-log transformed, so exp() recovers the
# multiplicative effect: median income multiplied by ~1.0105 per AFQT point
exp(income_lmb$coefficients)
##  (Intercept)         AFQT 
## 19310.793100     1.010512
exp(confint(income_lmb))
##                    2.5 %       97.5 %
## (Intercept) 17844.692528 20897.346904
## AFQT            1.009208     1.011819
#plot the diagnostics 
par(mfrow=c(2,2))
plot(income_lmb)

#plot the presentation graphic 
# ggformula chain: jittered points + prediction band + confidence band
par(mfrow=c(1,1))
gf_jitter(log_income ~ AFQT, data = income) %>%
  gf_lm(interval = "prediction") %>%
  gf_lm(interval = "confidence", alpha = 0.5) + 
  labs(title = "Log(2005 Income) vs AFQT Score",
       y = "Log(2005 Income)", x = "AFQT Score") + 
  # geom_text(aes(7,600000, label = paste("R2 = ", signif(rsquared(income_lm), 3), "\n", 
  #                                    "Intercept =",signif(income_lm$coef[[1]], 4), "\n",
  #                                    " Slope =",signif(income_lm$coef[[2]], 4)))) + 
  theme_classic()


The residual diagnostic plot of the raw 2005 income as a function of AFQT score showed a pattern of trumpeting (variance increasing with the mean), so a log transformation of income was applied. The diagnostic plots for the log-transformed income show no problematic trends, so the regression was interpreted on the log scale. These data provide overwhelming evidence that 2005 income is associated with the IQ test score (AFQT score) (two-sided p-value < 0.0001). The AFQT score explained 8.9% of the variance in log 2005 income (\(R^2\) = 0.089). Back-transforming the slope, the median salary is estimated to be multiplied by 1.010 for each one-point increase in AFQT score (95% confidence interval: 1.009 to 1.012).



#plot the 2005 income as a function of education 
income %>%
  ggplot() + 
  geom_jitter(aes(x = Educ, y =  Income2005)) + 
  theme_classic()

#plot the 2005 income as a function of education with group means
# Educ takes few distinct integer values, so per-year means (red points/line)
# and +/- 1 SD error bars are a useful check of linearity and spread
income %>%
  ggplot(aes(x = Educ, y =  Income2005)) + 
  geom_jitter(alpha = 0.4) + 
  stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") + 
  stat_summary(fun = "mean", colour = "red", geom = "line") + 
  stat_summary(fun = mean,
               geom = "errorbar",
               fun.max = function(x) mean(x) + sd(x),
               fun.min = function(x) mean(x) - sd(x), col = "red") + 
  theme_classic()

#plot it again with the regression line and smooth 
income %>%
  ggplot(aes(x = Educ, y =  Income2005)) +
  geom_jitter(alpha = 0.6) + 
  geom_smooth(method = "lm", se = F) + 
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#run the regression 
# simple linear regression of raw 2005 income on years of education
income_lm2 <- lm(Income2005 ~ Educ, data = income)
summary(income_lm2)
## 
## Call:
## lm(formula = Income2005 ~ Educ, data = income)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -88660 -24121  -7686  12782 614807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -40199.6     4865.0  -8.263 2.24e-16 ***
## Educ          6451.5      344.7  18.717  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43860 on 2582 degrees of freedom
## Multiple R-squared:  0.1195, Adjusted R-squared:  0.1191 
## F-statistic: 350.3 on 1 and 2582 DF,  p-value: < 2.2e-16
anova(income_lm2)
## Analysis of Variance Table
## 
## Response: Income2005
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Educ         1 6.7382e+11 6.7382e+11  350.33 < 2.2e-16 ***
## Residuals 2582 4.9662e+12 1.9234e+09                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lm2)
##                  2.5 %     97.5 %
## (Intercept) -49739.365 -30659.786
## Educ          5775.593   7127.356
#plot the diagnostics 
# residuals-vs-fitted again shows trumpeting -> log transform in next section
par(mfrow=c(2,2))
plot(income_lm2)

###############################################################################

#plot the log(2005 income) as a function of education 
# log_income was created in the 8.28 AFQT section above
income %>%
  ggplot() + 
  geom_jitter(aes(x = Educ, y =  log_income)) + 
  theme_classic()

#plot the log(2005 income) as a function of education with group means
income %>%
  ggplot(aes(x = Educ, y =  log_income)) + 
  geom_jitter(alpha = 0.4) + 
  stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") + 
  stat_summary(fun = "mean", colour = "red", geom = "line") + 
  stat_summary(fun = mean,
               geom = "errorbar",
               fun.max = function(x) mean(x) + sd(x),
               fun.min = function(x) mean(x) - sd(x), col = "red") + 
  theme_classic()

#plot it again with the regression line and smooth 
income %>%
  ggplot(aes(x = Educ, y =  log_income)) +
  geom_jitter(alpha = 0.6) + 
  geom_smooth(method = "lm", se = F) + 
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#run the regression 
# simple linear regression of log(2005 income) on years of education
income_lm2b <- lm(log_income ~ Educ, data = income)
summary(income_lm2b)
## 
## Call:
## lm(formula = log_income ~ Educ, data = income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8671 -0.3487  0.1441  0.5772  2.6981 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.880995   0.103473   85.83   <2e-16 ***
## Educ        0.112067   0.007331   15.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9328 on 2582 degrees of freedom
## Multiple R-squared:  0.08299,    Adjusted R-squared:  0.08264 
## F-statistic: 233.7 on 1 and 2582 DF,  p-value: < 2.2e-16
anova(income_lm2b)
## Analysis of Variance Table
## 
## Response: log_income
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## Educ         1  203.32  203.32  233.69 < 2.2e-16 ***
## Residuals 2582 2246.46    0.87                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lm2b)
##                  2.5 %    97.5 %
## (Intercept) 8.67809691 9.0838926
## Educ        0.09769147 0.1264416
#plot the diagnostics 
par(mfrow=c(2,2))
plot(income_lm2b)

#back-transform the estimates 
# natural-log response, so exp(slope) is the multiplicative change in
# median income per additional year of education (~1.12x)
exp(income_lm2b$coefficients)
## (Intercept)        Educ 
## 7193.943312    1.118587
exp(confint(income_lm2b))
##                   2.5 %      97.5 %
## (Intercept) 5872.859360 8812.201552
## Educ           1.102623    1.134783
#plot the presentation graphic 
par(mfrow=c(1,1))
gf_jitter(log_income ~ Educ, data = income) %>%
  gf_lm(interval = "prediction") %>%
  gf_lm(interval = "confidence", alpha = 0.5) + 
  labs(title = "Log(2005 Income) vs Years of Education",
       y = "Log(2005 Income)", x = "Years of Education") + 
  theme_classic()


Investigation of the means of the sub-group distributions shows a straight-line regression would be appropriate, but the variability increases as the mean of 2005 income increases. A weighted regression would be appropriate for this situation; however, weighted regression is not covered in the Sleuth book until chapter 11, so a simple linear regression was conducted. The residual diagnostic plot of the raw 2005 income as a function of years of education showed a pattern of trumpeting, so a log transformation was conducted. The diagnostic plots of the log-transformed income have no problematic trends, so the interpretation of the regression was performed with the log-transformed income data. These data provide overwhelming evidence that 2005 income is associated with the number of years of education (p-value < 0.0001). Years of education explained 8.3% of the variance in log 2005 income (\(R^2\) = 0.083). The median salary is estimated to be multiplied by 1.12 for each additional year of education (95% confidence interval: 1.10 to 1.13).


9.21

#read in data
deposit_feeders <- Sleuth3::ex0921
 
#looking at the dataframe
head(deposit_feeders)
##                 Species Weight Ingestion Organic
## 1     Hydrobia neglecta   0.20      0.57    18.0
## 2     Hydrobia ventrosa   0.20      0.86    17.0
## 3       Tubifex tubifex   0.27      0.43    29.7
## 4       Hyalella azteca   0.32      0.43    50.0
## 5 Potamopyrgus jenkinsi   0.46      2.70    14.4
## 6        Hydrobia ulvae   0.90      0.67    13.0
#transforming the data as suggested in the exercise
# log(x + 1) is used (rather than log(x)) per the exercise instructions
deposit_feeders <- deposit_feeders %>%
  mutate(log_weight = log(Weight +1), 
         log_ingestion = log(Ingestion +1), 
         log_organic = log(Organic +1))

#looking at scatterplots of the variables
# pairwise scatterplot matrix of the three log-transformed columns (cols 5:7)
ggpairs(deposit_feeders, columns = 5:7)

#run the mutiple regression 
# note: "log_weight + log_organic + log_weight:log_organic" is equivalent
# to the shorthand "log_weight * log_organic"
deposit_feeders_lm <- lm(log_ingestion ~ log_weight + log_organic + log_weight:log_organic, data = deposit_feeders)
summary(deposit_feeders_lm)
## 
## Call:
## lm(formula = log_ingestion ~ log_weight + log_organic + log_weight:log_organic, 
##     data = deposit_feeders)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92998 -0.24353 -0.00468  0.38169  0.74882 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.41061    0.68647   4.968 0.000168 ***
## log_weight              0.89804    0.13483   6.661  7.6e-06 ***
## log_organic            -0.95565    0.22866  -4.179 0.000806 ***
## log_weight:log_organic -0.01311    0.05683  -0.231 0.820672    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5071 on 15 degrees of freedom
## Multiple R-squared:  0.9796, Adjusted R-squared:  0.9755 
## F-statistic: 239.6 on 3 and 15 DF,  p-value: 6.847e-13
confint(deposit_feeders_lm)
##                             2.5 %     97.5 %
## (Intercept)             1.9474376  4.8737896
## log_weight              0.6106594  1.1854297
## log_organic            -1.4430189 -0.4682821
## log_weight:log_organic -0.1342289  0.1080110
#back-transform the estimates 
# response was natural-log(x+1) transformed, so exp() is the right inverse;
# interpretation on the original scale is approximate because of the +1 shift
exp(deposit_feeders_lm$coefficients)
##            (Intercept)             log_weight            log_organic 
##             30.2838202              2.4547981              0.3845619 
## log_weight:log_organic 
##              0.9869766
exp(confint(deposit_feeders_lm))
##                            2.5 %      97.5 %
## (Intercept)            7.0107000 130.8157202
## log_weight             1.8416453   3.2720925
## log_organic            0.2362136   0.6260769
## log_weight:log_organic 0.8743899   1.1140600
#plot the diagnostics 
par(mfrow=c(2,2))
plot(deposit_feeders_lm)

#plot the presentation graphic
# (presentation graphic not produced for this exercise)

These data provide strong evidence that the ingestion rate of deposit feeders is associated with the percentage of organic matter in the food after accounting for the effect of species weight (p-value < 0.0001, multiple regression). These data also provide overwhelming evidence the the ingestion rate of deposit feeders is associated with the species weight after accounting for the percentage of organic matter in the food (p-value < 0.0001, multiple regression). However, there is no evidence of an interaction between the percentage of organic matter in the food and the species weight (p-value = 0.82, multiple regression).


Supplemental Problems

8.26

#read in data
mammals <- Sleuth3::ex0826

#looking at the dataframe
head(mammals)
##               CommonName                  Species   Mass Metab Life
## 1                Echidna   Tachiglossus aculeatus  2.500   302   14
## 2    Long-beaked echidna        Zaglossus bruijni 10.300   594   20
## 3               Platypus Ornithorhynchus anatinus  1.300   229    9
## 4                Opossum Lutreolina crassicaudata  0.812   196    5
## 5 South American opossum    Didelphis marsupialis  1.330   299    6
## 6       Virginia opossum     Didelphis virginiana  3.260   519    8
#transforming the data as described in the question 
# mass_3 = Mass^(3/4) (Kleiber's Law exponent); natural logs of both
# Metab and Mass support the log-log regression used below
mammals <- mammals %>%
  mutate(mass_3 = (Mass^(3/4)), 
         log_metab = log(Metab), 
         log_mass = log(Mass))

#plot the metabolic rate by mass 
mammals %>%
  ggplot() + 
  geom_point(aes( x = Mass, y =  Metab)) + 
  labs(title = "Metabolic Rate v Mass", 
       x = "Mass (kg)", y = "Average Basal Metabolic Rate (kJ/Day)") + 
  theme_classic()

#plot it again with the regression line and smooth 
mammals %>%
  ggplot(aes( x = Mass, y =  Metab)) +
  geom_point(alpha = 0.6) + 
  geom_smooth(method = "lm", se = F) + 
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
  labs(title = "Metabolic Rate v Mass", 
       x = "Mass (kg)", y = "Average Basal Metabolic Rate (kJ/Day)") + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# #plot the metabolic rate by mass_3
# mammals %>%
#   ggplot() + 
#   geom_point(aes( x = mass_3, y =  Metab)) + 
#   labs(title = expression("Average Basal Metabolic Rate (kJ/Day) v Mass"^(3/4)* "(kg)"), 
#        x = expression(paste(Mass^(3/4), "(kg)")), 
#        y = "Average Basal Metabolic Rate (kJ/Day)") + 
#   theme_classic()
# 
# #plot it again with the regression line and smooth 
# mammals %>%
#   ggplot(aes( x = mass_3, y =  Metab)) +
#   geom_point(alpha = 0.6) + 
#   geom_smooth(method = "lm", se = F) + 
#   geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
#   labs(title = expression("Average Basal Metabolic Rate (kJ/Day) v Mass"^(3/4)* "(kg)"),
#        x = expression(paste(Mass^(3/4), "(kg)")), 
#        y = "Average Basal Metabolic Rate (kJ/Day)") + 
#   theme_classic()

#plot the metabolic rate by mass 
# log-log scatterplot: clearly linear, supporting the power-law model
mammals %>%
  ggplot() + 
  geom_point(aes( x = log_mass, y =  log_metab)) + 
  labs(title = "Log(Metabolic Rate) v Log(Mass)", 
       x = "Log(Mass (kg))", y = "Log(Average Basal Metabolic Rate (kJ/Day))") + 
  theme_classic()

#plot it again with the regression line and smooth 
mammals %>%
  ggplot(aes( x = log_mass, y =  log_metab)) +
  geom_point(alpha = 0.6) + 
  geom_smooth(method = "lm", se = F) + 
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) + 
  labs(title = "Log(Metabolic Rate) v Log(Mass)", 
       x = "Log(Mass (kg))", y = "Log(Average Basal Metabolic Rate (kJ/Day))") + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#run the regression 
# log-log regression: the slope estimates the allometric exponent;
# Kleiber's Law predicts this exponent equals 3/4
mammals_lm <- lm(log_metab ~ log_mass, data = mammals)
summary(mammals_lm)
## 
## Call:
## lm(formula = log_metab ~ log_mass, data = mammals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14216 -0.26466 -0.04889  0.25308  1.37616 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.63833    0.04709  119.73   <2e-16 ***
## log_mass     0.73874    0.01462   50.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4572 on 93 degrees of freedom
## Multiple R-squared:  0.9649, Adjusted R-squared:  0.9645 
## F-statistic:  2553 on 1 and 93 DF,  p-value: < 2.2e-16
anova(mammals_lm)
## Analysis of Variance Table
## 
## Response: log_metab
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## log_mass   1 533.82  533.82  2553.4 < 2.2e-16 ***
## Residuals 93  19.44    0.21                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the 95% CI for the slope (0.710, 0.768) contains 3/4, consistent
# with Kleiber's Law
confint(mammals_lm)
##                 2.5 %    97.5 %
## (Intercept) 5.5448128 5.7318485
## log_mass    0.7097121 0.7677752
#back-transform the estimates 
# natural logs were used, so exp() is the correct inverse; note that for a
# log-log model exp(slope) is the multiplicative change in median metabolic
# rate per e-fold (~2.72x) increase in mass, not per unit of mass
exp(mammals_lm$coefficients)
## (Intercept)    log_mass 
##  280.993255    2.093304
exp(confint(mammals_lm))
##                  2.5 %     97.5 %
## (Intercept) 255.906668 308.539085
## log_mass      2.033406   2.154967
#plot the diagnostics 
par(mfrow=c(2,2))
plot(mammals_lm)

#plot the presentation graphic 
par(mfrow=c(1,1))
gf_point(log_metab ~ log_mass, data = mammals) %>%
  gf_lm(interval = "prediction") %>%
  gf_lm(interval = "confidence", alpha = 0.5) + 
  labs(title = expression("Log(Average Basal Metabolic Rate (kJ/Day)) v Log(Mass (kg))"),
       y = "Log(Average Basal Metabolic Rate (kJ/Day))", 
       x = "Log(Mass (kg))") + 
  # geom_text(aes(5,5, label = paste("R2 = ", signif(rsquared(mammals_lm), 3), "\n", 
  #                                    "Intercept =",signif(mammals_lm$coef[[1]], 4), "\n",
  #                                    " Slope =",signif(mammals_lm$coef[[2]], 4)))) + 
  theme_classic()


The loess smooth of the scatterplot of the raw metabolic rate against average mass demonstrated the need for a transformation, so both variables were log-transformed and a simple linear regression was fit on the log-log scale. These data provide overwhelming evidence that metabolic rate is associated with average mass in mammals (p-value < 0.0001). The log of average mass explained 96.5% of the variance in the log of metabolic rate (\(R^2\) = 0.965). The estimated allometric exponent (the slope on the log-log scale) is 0.739, with a 95% confidence interval of 0.710 to 0.768 that contains the value 3/4 predicted by Kleiber's Law; equivalently, the median metabolic rate is multiplied by an estimated 2.09 (95% CI: 2.03 to 2.15) for each e-fold (roughly 2.72-fold) increase in mass. Based on these data there is strong support for Kleiber's Law in mammals; however, it would not be appropriate to generalize that claim to all animal species from this dataset.


9.23

#read in data
# ex0923 adds Gender to the AFQT/Educ/Income2005 variables from 8.28
incomeb <- Sleuth3::ex0923

#looking at the dataframe
head(incomeb)
##   Subject Gender   AFQT Educ Income2005
## 1       2 female  6.841   12       5500
## 2       6   male 99.393   16      65000
## 3       7   male 47.412   12      19000
## 4       8 female 44.022   14      36000
## 5       9   male 59.683   14      65000
## 6      13   male 72.313   16       8000
#looking at scatterplots of the variables
# income vs education, colored/shaped by gender
p91 <- incomeb %>%
  ggplot() + 
  geom_jitter(aes(x = Educ, y = Income2005, color = Gender, shape = Gender), 
              alpha = 0.4) + 
  labs(title = "2005 Income v Years of Education", 
       x = "Years of Education", y = "2005 Income ($)") + 
  theme_classic() +
  theme(legend.position = c(0.2, 0.85))

# income vs AFQT score, colored/shaped by gender
p92 <- incomeb %>%
  ggplot() + 
  geom_jitter(aes(x = AFQT, y = Income2005, color = Gender, shape = Gender), 
              alpha = 0.4) + 
  labs(title = "2005 Income v AFQT Scores", 
       x = "AFQT Score", y = "2005 Income ($)") + 
  theme_classic() + 
  theme(legend.position = c(0.2, 0.85))


# side-by-side display of the two panels (gridExtra)
grid.arrange(p91, p92, ncol=2)

#run the mutiple regression 
# additive multiple regression of raw income on gender, AFQT, and education
incomeb_lm <- lm(Income2005 ~ Gender + AFQT + Educ, data = incomeb)
summary(incomeb_lm)
## 
## Call:
## lm(formula = Income2005 ~ Gender + AFQT + Educ, data = incomeb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104620  -20911   -4681   11105  604024 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -48760.12    4868.19 -10.016  < 2e-16 ***
## Gendermale   28463.29    1621.08  17.558  < 2e-16 ***
## AFQT           222.98      36.32   6.139 9.56e-10 ***
## Educ          5158.28     402.67  12.810  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41080 on 2580 degrees of freedom
## Multiple R-squared:  0.228,  Adjusted R-squared:  0.2271 
## F-statistic: 253.9 on 3 and 2580 DF,  p-value: < 2.2e-16
confint(incomeb_lm)
##                  2.5 %      97.5 %
## (Intercept) -58306.080 -39214.1570
## Gendermale   25284.530  31642.0446
## AFQT           151.762    294.1969
## Educ          4368.699   5947.8634
#plot the diagnostics 
# trumpeting in the residual plot again motivates the log transform below
par(mfrow=c(2,2))
plot(incomeb_lm)

###############################################################################
#log transform the data because of the trumpetting in the residual plot
incomeb <- incomeb %>%
  mutate(log_income = log(Income2005))

#looking at scatterplots of the variables
p91 <- incomeb %>%
  ggplot() + 
  geom_jitter(aes(x = Educ, y = log_income, color = Gender, shape = Gender), 
              alpha = 0.4) + 
  labs(title = "Log(2005 Income) v Years of Education", 
       x = "Years of Education", y = "Log(2005 Income ($))") + 
  theme_classic() +
  theme(legend.position = "bottom")

p92 <- incomeb %>%
  ggplot() + 
  geom_jitter(aes(x = AFQT, y = log_income, color = Gender, shape = Gender), 
              alpha = 0.4) + 
  labs(title = "Log(2005 Income) v AFQT Scores", 
       x = "AFQT Score", y = "Log(2005 Income ($))") + 
  theme_classic() + 
  theme(legend.position = "bottom")


grid.arrange(p91, p92, ncol=2)

#run the mutiple regression 
# additive model on the log scale; Gendermale is the male-vs-female offset
# after adjusting for AFQT and education
incomeb_lmb <- lm(log_income ~ Gender + AFQT + Educ, data = incomeb)
summary(incomeb_lmb)
## 
## Call:
## lm(formula = log_income ~ Gender + AFQT + Educ, data = incomeb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0906 -0.3301  0.1404  0.5091  2.5452 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.7312115  0.1026287  85.076  < 2e-16 ***
## Gendermale  0.6245093  0.0341748  18.274  < 2e-16 ***
## AFQT        0.0059139  0.0007657   7.724  1.6e-14 ***
## Educ        0.0769506  0.0084888   9.065  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8661 on 2580 degrees of freedom
## Multiple R-squared:  0.2101, Adjusted R-squared:  0.2092 
## F-statistic: 228.7 on 3 and 2580 DF,  p-value: < 2.2e-16
confint(incomeb_lmb)
##                   2.5 %      97.5 %
## (Intercept) 8.529968655 8.932454398
## Gendermale  0.557496441 0.691522169
## AFQT        0.004412575 0.007415313
## Educ        0.060305047 0.093596160
#plot the diagnostics 
par(mfrow=c(2,2))
plot(incomeb_lmb)

#back-transform the estimates 
# exp(Gendermale) = 1.87: estimated male/female ratio of median incomes
# at fixed AFQT and education
exp(incomeb_lmb$coefficients)
## (Intercept)  Gendermale        AFQT        Educ 
## 6193.226825    1.867329    1.005931    1.079989
exp(confint(incomeb_lmb))
##                   2.5 %      97.5 %
## (Intercept) 5064.287091 7573.831779
## Gendermale     1.746295    1.996753
## AFQT           1.004422    1.007443
## Educ           1.062161    1.098116
#plot the presentation graphic 
# separate least-squares lines by gender (color/shape aesthetics carry
# through to geom_smooth, so one line per gender is drawn)
incomeb %>%
  ggplot(aes(x = Educ, y = log_income, color = Gender, shape = Gender)) + 
  geom_jitter(alpha = 0.4) + 
  geom_smooth(method = "lm", se = T) + 
  labs(title = "Effect of Gender and Years of Education on 2005 Income",
    x = "Years of Education", y = "2005 Income") + 
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'


The residual diagnostic plot of the raw 2005 income as a function of AFQT score, years of education, and gender showed a pattern of trumpeting, so a log transformation was conducted. The diagnostic plots of the log-transformed income have no problematic trends, so the interpretation of the regression was performed with the log-transformed income data. There is overwhelming evidence that the mean salary for males exceeds the mean salary for females with the same years of education and AFQT scores (p-value < 0.0001, multiple regression). The median salary for males is estimated to be 1.87 times that of females with the same years of education and AFQT scores (95% confidence interval: 1.75 to 2.00).


Master Problem

library(FSA)
## ## FSA v0.9.3. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
## 
## Attaching package: 'FSA'
## The following object is masked from 'package:mosaic':
## 
##     perc
library(car)      
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
##   method       from
##   hist.boot    FSA 
##   confint.boot FSA
## 
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
## 
##     bootCase
## The following objects are masked from 'package:mosaic':
## 
##     deltaMethod, logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following objects are masked from 'package:testthat':
## 
##     equals, is_less_than, not
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)

#read-in data 
ruf <- read.csv("./data/RuffeSLRH.csv") 

#transforming the data
# keep only July samples; base-10 logs of weight (wt) and total length (tl)
# are standard for fisheries length-weight regressions
ruf <- ruf %>%
  filter(month == 7) %>%
  mutate(logW = log10(wt), 
         logL = log10(tl)) %>%
  select(-fishID, -day)
  
#looking at the dataframe
# headtail() (FSA) shows the first and last three rows
headtail(ruf)
##      year month  tl   wt      logW     logL
## 1    1988     7  78  6.0 0.7781513 1.892095
## 2    1988     7  81  7.0 0.8450980 1.908485
## 3    1988     7  82  7.0 0.8450980 1.913814
## 1936 2004     7 137 28.0 1.4471580 2.136721
## 1937 2004     7 143 31.4 1.4969296 2.155336
## 1938 2004     7 174 82.4 1.9159272 2.240549
#filter for just select years and creating seperate datasets 
ruf90 <- ruf %>%
  filter(year == 1990)

ruf9000 <- ruf %>%
  filter(year %in% c(1990,2000))

#creating the figures from chapter 7
# raw scale (left) vs log10-log10 scale (right): the log-log plot is linear
pm1 <- ruf90 %>%
  ggplot() + 
  geom_point(aes(x = tl, y = wt)) + 
  labs(x = "Total Length (mm)", y =  "Weight (g)" ) + 
  theme_classic()

pm2 <- ruf90 %>%
  ggplot() + 
  geom_point(aes(x = logL, y = logW)) + 
  labs(x = "Log(Total Length)", y =  "Log(Weight)" ) + 
  theme_classic()

grid.arrange(pm1, pm2, ncol=2)

#fitting the regression 
# log10-log10 length-weight model: logW = a + b * logL, i.e. wt = 10^a * tl^b
ruf_lm <- lm(logW ~ logL, data = ruf90)
summary(ruf_lm)
## 
## Call:
## lm(formula = logW ~ logL, data = ruf90)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11445 -0.02780  0.00245  0.02819  0.13892 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.91447    0.06617  -74.27   <2e-16 ***
## logL         3.02516    0.03211   94.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04287 on 144 degrees of freedom
## Multiple R-squared:  0.984,  Adjusted R-squared:  0.9839 
## F-statistic:  8874 on 1 and 144 DF,  p-value: < 2.2e-16
anova(ruf_lm)
## Analysis of Variance Table
## 
## Response: logW
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## logL        1 16.3104 16.3104  8874.3 < 2.2e-16 ***
## Residuals 144  0.2647  0.0018                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#back-transform the estimates 
# BUG FIX: logW and logL were created with log10() (see the mutate above),
# so the correct inverse is 10^, not exp(). The previously printed values
# (e.g. exp(3.02516) = 20.597) were meaningless. Note also that for the
# slope of a log-log model the exponent itself (~3.03, near cubic/isometric
# growth) is the interpretable quantity; 10^slope (~1.06e3) is the
# multiplicative change in weight per tenfold increase in length.
10^ruf_lm$coefficients
## (Intercept) ~ 1.22e-05, logL ~ 1.06e+03 (approximate; rerun to confirm)
10^confint(ruf_lm)
## intercept CI ~ (9.0e-06, 1.6e-05); slope CI ~ (9.2e+02, 1.2e+03)
## (approximate; rerun to confirm)
##################stealing the rest of the code from his website ##############
# Predict weights at two example lengths, back-transforming with the
# log-normal bias correction factor from FSA::logbtcf (base 10, matching
# the log10() transform used to build logW/logL)
lens <- c(100,160)                  # vector of lengths
nd <- data.frame(logL=log10(lens))  # df of log(lengths)
( plogW <- predict(ruf_lm,nd) )       # predicted log(weights)
##        1        2 
## 1.135860 1.753356
( cf <- logbtcf(ruf_lm,10) )  # correction factor
## [1] 1.004884
cf*(10^plogW)         # back-transforming with bias correction
##        1        2 
## 13.73965 56.94713
# confidence interval for the mean weight at each length
mlogW <- predict(ruf_lm,nd,interval="confidence")
cf*10^mlogW
##        fit      lwr      upr
## 1 13.73965 13.49177 13.99208
## 2 56.94713 55.43960 58.49566
# prediction interval for an individual fish's weight at each length
plogW <- predict(ruf_lm,nd,interval="prediction")
cf*10^plogW
##        fit      lwr      upr
## 1 13.73965 11.29455 16.71406
## 2 56.94713 46.76664 69.34378
# fitted line on the log-log scale
plot(logW~logL,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
     ylab="log Weight (g)",xlab="log Total Length (mm)")
tmp <- range(ruf90$logL)
xs <- seq(tmp[1],tmp[2],length.out=99)
ys <- predict(ruf_lm,data.frame(logL=xs))
lines(ys~xs,lwd=2)

# back-transformed (bias-corrected) fitted curve on the raw scale
plot(wt~tl,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
     ylab="Weight (g)",xlab="Total Length (mm)")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)

# raw-scale curve with back-transformed prediction interval bands
btys <- cf*10^predict(ruf_lm,data.frame(logL=xs),
                      interval="prediction")
head(btys,n=3)
##        fit      lwr      upr
## 1 2.251802 1.841414 2.753652
## 2 2.333299 1.908384 2.852825
## 3 2.417746 1.977784 2.955579
plot(wt~tl,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
     ylab="Weight (g)",xlab="Total Length (mm)")
lines(btys[,"fit"]~btxs,col="gray20",lwd=2,lty="solid")
lines(btys[,"lwr"]~btxs,col="gray20",lwd=2,lty="dashed")
lines(btys[,"upr"]~btxs,col="gray20",lwd=2,lty="dashed")

r <- residuals(ruf_lm)
fv <- fitted(ruf_lm)

#diagnostic plots for the length-weight fit
par(mfrow=c(2,2))
plot(ruf_lm)


These data provide overwhelming evidence that weight is associated with total length in Ruffe (p-value < 0.0001, analysis of variance F-test). Because both weight and length were log10-transformed, the slope of 3.03 (95% confidence interval: 2.96 to 3.09) is the allometric exponent: weight scales approximately with the cube of total length, so a 1% increase in total length is associated with roughly a 3% increase in weight.